import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from warnings import filterwarnings
filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Load the diabetes dataset from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where that exact file exists.
data = pd.read_csv("C:\\Users\\laxma\\Downloads\\diabetes.csv")
# Bare expression: shows the loaded frame as the cell output.
data
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
# Preview the first rows of the dataset (head() defaults to five rows).
data.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Preview the last rows of the dataset (tail() defaults to five rows).
data.tail()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
# Count missing (NaN) entries in each column.
data.isnull().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()
0
# Sum of the Insulin column over all rows.
# NOTE(review): the column contains many literal 0 values; presumably these stand
# for missing measurements, which would distort this total — confirm.
data.Insulin.sum()
61286
# Inspect the raw Insulin values.
# NOTE(review): no NaNs are reported for this column, yet many entries are 0;
# presumably 0 is a placeholder for "not measured" — confirm before modelling.
data.Insulin
0 0
1 0
2 0
3 94
4 168
...
763 180
764 0
765 112
766 0
767 0
Name: Insulin, Length: 768, dtype: int64
# List the column labels of the dataset.
data.columns
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
#VISUALIZATION
# Bar chart with Outcome on the x axis and Age as the bar height.
# One bar is drawn per row, so bars sharing an Outcome value overlap.
plt.bar(x=data['Outcome'], height=data['Age'])
plt.xticks(rotation=90)
plt.show()
# Interactive bar chart: Outcome plotted against Pregnancies, coloured by Pregnancies.
fig = px.bar(
    data_frame=data,
    x='Pregnancies',
    y='Outcome',
    color='Pregnancies',
)
fig.show()
# Interactive violin plot: BMI distribution for each SkinThickness value,
# coloured by SkinThickness.
fig = px.violin(
    data_frame=data,
    x='SkinThickness',
    y='BMI',
    color='SkinThickness',
)
fig.show()
# Number of rows in each Outcome class.
plt.figure(figsize=(10, 4))
sns.countplot(data=data, x='Outcome', color='b')
plt.xticks(rotation=90)
plt.show()
# Horizontal count plot restricted to the ten most frequent
# DiabetesPedigreeFunction values, most frequent first.
plt.figure(figsize=(10, 4))
pedigree_counts = data['DiabetesPedigreeFunction'].value_counts()
top_pedigree_counts = pedigree_counts.nlargest(10)
sns.countplot(
    y=data.DiabetesPedigreeFunction,
    order=top_pedigree_counts.index,
    color='red',
)
<AxesSubplot:xlabel='count', ylabel='DiabetesPedigreeFunction'>
# Line plot of Age (y) against BloodPressure (x).
# The title names the columns that are actually plotted, so the figure is not
# mislabelled as a Glucose/BMI chart.
sns.lineplot(x='BloodPressure', y='Age', data=data).set_title('Variation of Age with BloodPressure')
Text(0.5, 1.0, 'Variation of Glucose with BMI')
# Bar plot of SkinThickness per Outcome class (seaborn aggregates each class
# with its default estimator).
# x and y are passed by keyword, like the other seaborn calls in this notebook:
# current seaborn releases no longer accept them as positional arguments.
sns.barplot(x='Outcome', y='SkinThickness', data=data, color='r')
plt.xticks(rotation=90)
plt.show()
# Scatter plot of Age against BMI, one point per row.
plt.figure(figsize=(8, 4))
sns.scatterplot(x='BMI', y='Age', data=data)
# Axis labels and title are independent of each other; set labels first, then the title.
plt.xlabel('BMI')
plt.ylabel('Age')
plt.title('BMI and there Age')
plt.show()
# Distribution plot of the Age column (returns a seaborn FacetGrid).
sns.displot(data["Age"])
<seaborn.axisgrid.FacetGrid at 0x24d89f93910>
# Box plot of Glucose for every distinct BloodPressure value.
sns.boxplot(data=data, x='BloodPressure', y='Glucose')
# There are many distinct BloodPressure values, so turn the tick labels vertical.
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]),
[Text(0, 0, '0'),
Text(1, 0, '24'),
Text(2, 0, '30'),
Text(3, 0, '38'),
Text(4, 0, '40'),
Text(5, 0, '44'),
Text(6, 0, '46'),
Text(7, 0, '48'),
Text(8, 0, '50'),
Text(9, 0, '52'),
Text(10, 0, '54'),
Text(11, 0, '55'),
Text(12, 0, '56'),
Text(13, 0, '58'),
Text(14, 0, '60'),
Text(15, 0, '61'),
Text(16, 0, '62'),
Text(17, 0, '64'),
Text(18, 0, '65'),
Text(19, 0, '66'),
Text(20, 0, '68'),
Text(21, 0, '70'),
Text(22, 0, '72'),
Text(23, 0, '74'),
Text(24, 0, '75'),
Text(25, 0, '76'),
Text(26, 0, '78'),
Text(27, 0, '80'),
Text(28, 0, '82'),
Text(29, 0, '84'),
Text(30, 0, '85'),
Text(31, 0, '86'),
Text(32, 0, '88'),
Text(33, 0, '90'),
Text(34, 0, '92'),
Text(35, 0, '94'),
Text(36, 0, '95'),
Text(37, 0, '96'),
Text(38, 0, '98'),
Text(39, 0, '100'),
Text(40, 0, '102'),
Text(41, 0, '104'),
Text(42, 0, '106'),
Text(43, 0, '108'),
Text(44, 0, '110'),
Text(45, 0, '114'),
Text(46, 0, '122')])
# Violin plot of the Age distribution for each Pregnancies count.
sns.violinplot(data=data, x='Pregnancies', y='Age')
<AxesSubplot:xlabel='Pregnancies', ylabel='Age'>
#MODEL BUILDING
# Features are every column except the label; the label is the 'Outcome' column.
# Selecting by name instead of position keeps this correct if columns are reordered.
X = data.drop(columns='Outcome')
y = data['Outcome']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=4)

# Fit the scaler on the training split only, then apply those same statistics to
# the test split. Re-fitting on the test rows would standardise the two splits
# with different means/variances and leak test-set information into preprocessing.
sc = StandardScaler()
Xtr = sc.fit_transform(Xtr)
Xte = sc.transform(Xte)

# k-nearest-neighbours classifier with k=11 and Euclidean distance.
clf = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')
clf.fit(Xtr, ytr)
pred = clf.predict(Xte)

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. Swapping the arguments transposes the matrix.
print(confusion_matrix(yte, pred))
[[88 25] [14 27]]
# Fraction of test rows whose predicted label equals the true label.
# Accuracy is symmetric in its two arguments, so naming them explicitly
# yields the same value.
print(accuracy_score(y_true=yte, y_pred=pred))
0.7467532467532467